# Molecular Oncology 2020: Metabolism-associated molecular classiﬁcation of hepatocellular carcinoma

#---------------#
# 输入数据获取  #
#---------------#

# 设置工作路径
workdir <- "E:/IGBMC/externalproject/ExternalProject/Giotto/CESC/InputData"; setwd(workdir)

# 加载R包
library(impute)

# 自定义函数
## 自定义转换FPKM值为TPM
fpkmToTpm <- function(fpkm)
{
  exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
}

# 1.TCGA-CESC宫颈癌表达谱
fpkm <- read.table("TCGA-CESC.htseq_fpkm.tsv.gz", sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T)
fpkm <- 2^fpkm - 1 # 还原log2转化

# 将FPKM转为TPM
tpm <- apply(fpkm, 2, fpkmToTpm)
tpm <- as.data.frame(log2(tpm + 1))
rm(fpkm); gc()

# 匹配基因名
Ginfo <- read.delim("gencode.v22.annotation.gene.probeMap",row.names = 1,sep = "\t",check.names = F,stringsAsFactors = F,header = T)
comgene <- intersect(rownames(Ginfo), rownames(tpm))
Ginfo <- Ginfo[comgene,]
tpm <- tpm[comgene,]
identical(rownames(tpm), rownames(Ginfo))
tpm$Gene <- Ginfo[rownames(tpm),"gene"]
tpm <- as.data.frame(apply(tpm[,setdiff(colnames(tpm), "Gene")], 2, function(x) tapply(x, INDEX = factor(tpm$Gene), FUN = median, na.rm = TRUE))) # take median value for multiple match

# 提出临床信息
cesc.sinfo <- read.delim("TCGA-CESC.GDC_phenotype.tsv.gz",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T) # 317
cesc.sinfo <- cesc.sinfo[,c("age_at_initial_pathologic_diagnosis","neoplasm_histologic_grade","pathologic_T","pathologic_N","pathologic_M","clinical_stage","lymphovascular_invasion_indicator","menopause_status","primary_diagnosis.diagnoses")]
colnames(cesc.sinfo) <- c("age","grade","tstage","nstage","mstage","pstage","lymphovascular_invasion","menopause_status","histology")
cesc.sinfo <- cesc.sinfo[substr(rownames(cesc.sinfo), 14,16) == "01A",] # 299 primary tumors

# 读取泛癌生存数据并提取皮肤癌SKCM
pansurv <- read.delim("pancancerSurvivalData_XLu.txt",sep = "\t",row.names = NULL,check.names = F,stringsAsFactors = F,header = T)
pansurv <- pansurv[which(pansurv$type == "CESC"),]

# 匹配表达谱
samID <- substr(rownames(cesc.sinfo), 1, 12)
matchID <- match(pansurv$bcr_patient_barcode,samID)
pansurv$samID <- rownames(cesc.sinfo)[matchID]
sinfo <- pansurv[,c("OS","OS.time","PFI","PFI.time","samID")]
sinfo <- as.data.frame(na.omit(sinfo))
rownames(sinfo) <- sinfo$samID

cesc.sinfo$OS <- sinfo[rownames(cesc.sinfo),"OS"]
cesc.sinfo$OS.time <- sinfo[rownames(cesc.sinfo),"OS.time"]/30
cesc.sinfo$PFI <- sinfo[rownames(cesc.sinfo),"PFI"]
cesc.sinfo$PFI.time <- sinfo[rownames(cesc.sinfo),"PFI.time"]/30

# 提出鳞癌
cesc.sinfo <- cesc.sinfo[grepl("Squamous",cesc.sinfo$histology),] # 245

# 提取共有样本
comsam <- intersect(rownames(cesc.sinfo),colnames(tpm)) # 243
tcga.expr <- tpm[,comsam]
tcga.sinfo <- cesc.sinfo[comsam,]
write.table(tcga.expr,"tcga.expr.matched.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tcga.sinfo,"tcga.sinfo.matched.txt",sep = "\t",row.names = T,col.names = NA,quote = F)

# 2. GEO队列的表达谱数据（数据匹配以及临床数据整理）
load("CESC-GSE.Rdata")

## GSE44001
gse44001.expr <- as.data.frame(impute.knn(as.matrix(GSE44001_dat))$data) # 300
identical(colnames(gse44001.expr), rownames(GSE44001_pd))
gse44001.sinfo <- GSE44001_pd[,c(35,36,37,34)]
colnames(gse44001.sinfo) <- c("largest_diameter","pstage","DFS","DFS.time")
write.table(gse44001.expr,"gse44001.expr.matched.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(gse44001.sinfo,"gse44001.sinfo.matched.txt",sep = "\t",row.names = T,col.names = NA,quote = F)


batchPCA =function(indata, batch, fig.dir, PCA.fig.title, pos="bottomright", xy=c(1,2), cols=NULL, showID=FALSE, cex=1, showLegend=T) {
# indata is a data matrix with samples in columns and genes in rows.
# batch is a vector with the order matching the order in indata.
    library(ClassDiscovery)
    
    outfile = file.path(fig.dir, paste(PCA.fig.title, ".pdf",sep=""))
    N.batch = length(unique(batch))    
    if (is.null(cols)) { 
      cols <- rainbow(N.batch) 
    }else{
      if (length(cols) != N.batch) {stop("cols length not equal to batch length")}
    }           
    
    indata=na.omit(indata)
    pca<-SamplePCA(indata, usecor=F, center=T)
    pct1 <- round (pca@variances[xy[1]]/sum(pca@variances), digits=3)*100
    pct2 <- round (pca@variances[xy[2]]/sum(pca@variances), digits=3)*100
    xlab.text = paste("Comp ", xy[1], ": ", as.character(pct1), "% variance", sep="")
    ylab.text = paste("Comp ", xy[2], ": ", as.character(pct2), "% variance", sep="")    
    
    pdf(file=outfile)
    plot(pca@scores[,xy[1]], pca@scores[,xy[2]],  cex=0.7, xlab=xlab.text, ylab=ylab.text, col=cols[factor(batch)], pch=(1:N.batch)[factor(batch)],lwd=1.5, main=PCA.fig.title)
    abline(h=0, v=0, col="brown", lty=2)
    abline(h=0, v=0, col="brown", lty=2)
    center1<-tapply(pca@scores[,xy[1]], factor(batch), mean)
    center2<-tapply(pca@scores[,xy[2]], factor(batch), mean)
    for (ii in 1:length(center1)) {
        groupi<-pca@scores[as.numeric(factor(batch))==ii, xy]
        #  print(paste("Cluster", ii))
        if (class(groupi)=="matrix") {
            for (j in (1:nrow(groupi))) {
                segments( groupi[j,1], groupi[j,2], center1[ii], center2[ii], col=cols[ii] , lwd=0.3)
            }
        }else {
            segments( groupi[1], groupi[2], center1[ii], center2[ii], col=cols[ii] , lwd=0.3)
        }
    }
    points(center1, center2, pch=7, lwd=1.5,col=cols)
    if (showID) {
      text(pca@scores[,xy[1]], pca@scores[,xy[2]], colnames(indata), lwd=1, cex=cex)
    }
    if(showLegend){
      legend(pos,legend=names(table(factor(batch))), text.col=cols, pch=(1:N.batch), col=cols, lty=1)
    }
    invisible(dev.off())
}




#Survival analysis
library(survival)
rt=read.table("expTime.txt",header=T,sep="\t",check.names=F,row.names=1)     
outTab=data.frame()
sigGenes=c("futime","fustat")
for(i in colnames(rt[,3:ncol(rt)])){
cox <- coxph(Surv(futime, fustat) ~ rt[,i], data = rt)
coxSummary = summary(cox)
coxP=coxSummary$coefficients[,"Pr(>|z|)"]
if(coxP<0.05){
sigGenes=c(sigGenes,i)
outTab=rbind(outTab,
cbind(id=i,
HR=coxSummary$conf.int[,"exp(coef)"],
HR.95L=coxSummary$conf.int[,"lower .95"],
HR.95H=coxSummary$conf.int[,"upper .95"],
pvalue=coxSummary$coefficients[,"Pr(>|z|)"]))}}
write.table(outTab,file="uniCox.txt",sep="\t",row.names=F,quote=F)
uniSigExp=rt[,sigGenes]
uniSigExp=cbind(id=row.names(uniSigExp),uniSigExp)
write.table(uniSigExp,file="uniSigExp.txt",sep="\t",row.names=F,quote=F)
#train
rt<-read.table("uniSigExp.txt",head=T,sep='\t',check.names = F,row.names = 1)
x=as.matrix(rt[,c(3:ncol(rt))])
y=data.matrix(Surv(rt$futime,rt$fustat))
fit <- glmnet(x, y, family = "cox")
pdf("lambda.pdf")
plot(fit, xvar = "lambda", label = TRUE,lwd=3)
dev.off()
cvfit <- cv.glmnet(x, y, family="cox", nfolds=10)
pdf("cvfit.pdf")
plot(cvfit)
abline(v=log(c(cvfit$lambda.min,cvfit$lambda.1se)),lty="dashed")
dev.off()
coef <- coef(fit, s = cvfit$lambda.min)
index <- which(coef != 0)
actCoef <- coef[index]
lassoGene=row.names(coef)[index]
geneCoef=cbind(Gene=lassoGene,Coef=actCoef)
write.table(geneCoef,file="geneCoef.txt",sep="\t",quote=F,row.names=F)
riskScore=predict(cvfit, newx = x, s = "lambda.min",type="response")
outCol=c("futime","fustat",lassoGene)
risk=as.vector(ifelse(riskScore>median(riskScore),"high","low"))
outTab=cbind(rt[,outCol],riskScore=as.vector(riskScore),risk)
write.table(cbind(id=rownames(outTab),outTab),file="trainRisk.txt",sep="\t",quote=F,row.names=F)
#Validation
z<-read.table("expTime.txt",head=T,sep='\t',check.names = F,row.names = 1)
s=as.matrix(z[,c(3:ncol(z))])
riskScore=predict(cvfit, newx = s, s = "lambda.min",type="response")
outCol=c("futime","fustat",lassoGene)
risk=as.vector(ifelse(riskScore>median(riskScore),"high","low"))
outTab=cbind(z[,outCol],riskScore=as.vector(riskScore),risk)
write.table(cbind(id=rownames(outTab),outTab),file="testRisk.txt",sep="\t",quote=F,row.names=F)

#K-M curve
#train
library(survival)
library(survminer)
rt<-read.table("trainRisk.txt",head=T,sep='\t',check.names = F,row.names = 1)
length=length(levels(factor(rt$risk)))
diff=survdiff(Surv(futime, fustat) ~ risk, data = rt)
pValue=1-pchisq(diff$chisq, df=length-1)
fit <- survfit(Surv(futime, fustat) ~ risk, data = rt)
ggsurvplot(fit,
data=rt,
conf.int=T,
pval=paste0("P < 0.001"),
pval.size=6,
legend.title="risk",
legend.labs=levels(factor(rt[,"risk"])),
legend = c(0.8, 0.8),font.legend=10,xlab="Time(years)",break.time.by = 1,palette = "d3",surv.median.line = "hv",risk.table=T,cumevents=F,risk.table.height=.35)

#validation
library(survival)
library(survminer)
rt<-read.table("testRisk.txt",head=T,sep='\t',check.names = F,row.names = 1)
length=length(levels(factor(rt$risk)))
diff=survdiff(Surv(futime, fustat) ~ risk, data = rt)
pValue=1-pchisq(diff$chisq, df=length-1)
fit <- survfit(Surv(futime, fustat) ~ risk, data = rt)
ggsurvplot(fit,
data=rt,
conf.int=T,
pval=paste0("P < 0.001"),
pval.size=6,
legend.title="risk",
legend.labs=levels(factor(rt[,"risk"])),
legend = c(0.8, 0.8),font.legend=10,xlab="Time(years)",break.time.by = 1,palette = "d3",surv.median.line = "hv",risk.table=T,cumevents=F,risk.table.height=.35)

#ROC
rt=read.table("trainRisk.txt",header=T,sep="\t",check.names=F,row.names=1)
rocCol=c("#1B9E77", "#D95F02", "#7570B3")
aucText=c()
#5-year
pdf(file="ROC.pdf",width=6,height=6)
par(oma=c(0.5,1,0,1),font.lab=1.5,font.axis=1.5)
roc=survivalROC(Stime=rt$futime, status=rt$fustat, marker = rt$riskScore, predict.time =5, method="KM")
plot(roc$FP, roc$TP, type="l", xlim=c(0,1), ylim=c(0,1),col=rocCol[1],
xlab="False positive rate", ylab="True positive rate",
lwd = 2, cex.main=1.3, cex.lab=1.2, cex.axis=1.2, font=1.2)
aucText=c(aucText,paste0("five year"," (AUC=",sprintf("%.3f",roc$AUC),")"))
abline(0,1)
#3-year
roc=survivalROC(Stime=rt$futime, status=rt$fustat, marker = rt$riskScore, predict.time =3, method="KM")
aucText=c(aucText,paste0("three year"," (AUC=",sprintf("%.3f",roc$AUC),")"))
lines(roc$FP, roc$TP, type="l", xlim=c(0,1), ylim=c(0,1),col=rocCol[2],lwd = 2)
#1-year
roc=survivalROC(Stime=rt$futime, status=rt$fustat, marker = rt$riskScore, predict.time =1, method="KM")
aucText=c(aucText,paste0("one year"," (AUC=",sprintf("%.3f",roc$AUC),")"))
lines(roc$FP, roc$TP, type="l", xlim=c(0,1), ylim=c(0,1),col=rocCol[3],lwd = 2)
legend("bottomright", aucText,lwd=2,bty="n",col=rocCol)
dev.off()


rt=read.table("testRisk.txt",header=T,sep="\t",check.names=F,row.names=1)
rocCol=c("#1B9E77", "#D95F02", "#7570B3")
aucText=c()
#5-year
pdf(file="ROC.pdf",width=6,height=6)
par(oma=c(0.5,1,0,1),font.lab=1.5,font.axis=1.5)
roc=survivalROC(Stime=rt$futime, status=rt$fustat, marker = rt$riskScore, predict.time =5, method="KM")
plot(roc$FP, roc$TP, type="l", xlim=c(0,1), ylim=c(0,1),col=rocCol[1],
xlab="False positive rate", ylab="True positive rate",
lwd = 2, cex.main=1.3, cex.lab=1.2, cex.axis=1.2, font=1.2)
aucText=c(aucText,paste0("five year"," (AUC=",sprintf("%.3f",roc$AUC),")"))
abline(0,1)
#3-year
roc=survivalROC(Stime=rt$futime, status=rt$fustat, marker = rt$riskScore, predict.time =3, method="KM")
aucText=c(aucText,paste0("three year"," (AUC=",sprintf("%.3f",roc$AUC),")"))
lines(roc$FP, roc$TP, type="l", xlim=c(0,1), ylim=c(0,1),col=rocCol[2],lwd = 2)
#1-year
roc=survivalROC(Stime=rt$futime, status=rt$fustat, marker = rt$riskScore, predict.time =1, method="KM")
aucText=c(aucText,paste0("one year"," (AUC=",sprintf("%.3f",roc$AUC),")"))
lines(roc$FP, roc$TP, type="l", xlim=c(0,1), ylim=c(0,1),col=rocCol[3],lwd = 2)
legend("bottomright", aucText,lwd=2,bty="n",col=rocCol)
dev.off()

#Riskscore Plot
#train
library(pheatmap)
rt=read.table("trainRisk.txt",sep="\t",header=T,row.names=1,check.names=F)       
rt=rt[order(rt$riskScore),]                                     

riskClass=rt[,"risk"]
lowLength=length(riskClass[riskClass=="low"])
highLength=length(riskClass[riskClass=="high"])
line=rt[,"riskScore"]
line[line>10]=10
pdf(file="riskScore.pdf",width = 10,height = 4)
plot(line,
     type="p",
     pch=20,
     xlab="Patients (increasing risk socre)",
     ylab="Risk score",
     col=c(rep("#4DAF4A",lowLength),
     rep("#E41A1C",highLength)))
abline(h=median(rt$riskScore),v=lowLength,lty=2)
legend("topleft", c("High risk", "low Risk"),bty="n",pch=19,col=c("#E41A1C","#4DAF4A"),cex=1.2)
dev.off()


color=as.vector(rt$fustat)
color[color==1]="#E41A1C"
color[color==0]="#4DAF4A"
pdf(file="survStat.pdf",width = 10,height = 4)
plot(rt$futime,
     pch=19,
     xlab="Patients (increasing risk socre)",
     ylab="Survival time (years)",
     col=color)
legend("topleft", c("Dead", "Alive"),bty="n",pch=19,col=c("#E41A1C","#4DAF4A"),cex=1.2)
abline(v=lowLength,lty=2)
dev.off()


rt1=rt[c(3:(ncol(rt)-2))]
plotdata <- t(scale(rt1)))
plotdata[plotdata > 3] <- 3 
plotdata[plotdata < -3] <- -3
annotation=data.frame(type=rt[,ncol(rt)])
rownames(annotation)=rownames(rt)
pdf(file="heatmap.pdf",width = 10,height = 4)
pheatmap(plotdata, 
         annotation=annotation, 
         cluster_cols = FALSE,
         fontsize_row=11,
         show_colnames = F,
         fontsize_col=3,
         color = colorRampPalette(c("#343493", "white", "#C24A45"))(64))
dev.off()

#validation

library(pheatmap)
rt=read.table("testRisk.txt",sep="\t",header=T,row.names=1,check.names=F)       
rt=rt[order(rt$riskScore),]                                     

riskClass=rt[,"risk"]
lowLength=length(riskClass[riskClass=="low"])
highLength=length(riskClass[riskClass=="high"])
line=rt[,"riskScore"]
line[line>10]=10
pdf(file="riskScore.pdf",width = 10,height = 4)
plot(line,
     type="p",
     pch=20,
     xlab="Patients (increasing risk socre)",
     ylab="Risk score",
     col=c(rep("#4DAF4A",lowLength),
     rep("#E41A1C",highLength)))
abline(h=median(rt$riskScore),v=lowLength,lty=2)
legend("topleft", c("High risk", "low Risk"),bty="n",pch=19,col=c("#E41A1C","#4DAF4A"),cex=1.2)
dev.off()


color=as.vector(rt$fustat)
color[color==1]="#E41A1C"
color[color==0]="#4DAF4A"
pdf(file="survStat.pdf",width = 10,height = 4)
plot(rt$futime,
     pch=19,
     xlab="Patients (increasing risk socre)",
     ylab="Survival time (years)",
     col=color)
legend("topleft", c("Dead", "Alive"),bty="n",pch=19,col=c("#E41A1C","#4DAF4A"),cex=1.2)
abline(v=lowLength,lty=2)
dev.off()


rt1=rt[c(3:(ncol(rt)-2))]
plotdata <- t(scale(rt1)))
plotdata[plotdata > 3] <- 3 
plotdata[plotdata < -3] <- -3
annotation=data.frame(type=rt[,ncol(rt)])
rownames(annotation)=rownames(rt)
pdf(file="heatmap.pdf",width = 10,height = 4)
pheatmap(plotdata, 
         annotation=annotation, 
         cluster_cols = FALSE,
         fontsize_row=11,
         show_colnames = F,
         fontsize_col=3,
         color = colorRampPalette(c("#343493", "white", "#C24A45"))(64))
dev.off()

#Nomogram-Independence

multiCox<-read.table("multiCox.xls",head=T,sep='\t',check.names = F,stringsAsFactors = F)

unicox<-read.table("uniCox.txt",head=T,sep='\t',check.names = F,stringsAsFactors = F)
hrtable <- rbind(c("Univariate cox regression analysis",NA,NA,NA,NA),
                 unicox,
                 c("Multivariate cox regression analysis",NA,NA,NA,NA),
                 multiCox)

tabletext <- cbind(c("Variable",hrtable$id),
                   c("HR",format(round(as.numeric(hrtable$HR),3),nsmall = 3)),
                   c("lower 95%CI",format(round(as.numeric(hrtable$HR.95L),3),nsmall = 3)),
                   c("upper 95%CI",format(round(as.numeric(hrtable$HR.95H),3),nsmall = 3)),
                   c("pvalue",formatC(as.numeric(hrtable$pvalue), format = "e", digits = 2)))


tabletext[2,] <- c("Univariate cox regression analysis",NA,NA,NA,NA) 
tabletext[7,] <- c("Multivariate cox regression analysis",NA,NA,NA,NA) 


forestplot(labeltext=tabletext,
           mean=c(NA,log2(as.numeric(hrtable$HR))),
           lower=c(NA,log2(as.numeric(hrtable$HR.95L))), 
           upper=c(NA,log2(as.numeric(hrtable$HR.95H))),
           graph.pos=6,
           graphwidth = unit(.35,"npc"),
           fn.ci_norm="fpDrawDiamondCI",
           col=fpColors(box="#EF8A62", lines="#67A9CF", zero = "black"),
           boxsize=0.3,
           lwd.ci=1,
           ci.vertices.height = 0.1,ci.vertices=F,
           zero=1,
           lwd.zero=3,
           xticks = c(0,1,2,3),
           lwd.xaxis=2,
           xlab=expression("HR"),
           hrzl_lines=list("1" = gpar(lwd=2, col="black"),
                           "2" = gpar(lwd=1, col="grey50", lty=2),
                           "7" = gpar(lwd=1, col="grey50", lty=2),
                           "13" = gpar(lwd=2, col="black")),
           txt_gp=fpTxtGp(label=gpar(cex=1.2),
                          ticks=gpar(cex=0.85),
                          xlab=gpar(cex=1),
                          title=gpar(cex=1.5)),
           lineheight = unit(1.5,"cm"),
           colgap = unit(0.3,"cm"),
           mar=unit(rep(1.5, times = 4), "cm"),
           new_page = F
)

pbc<-read.table("Nomogram.txt",head=T,sep='\t',check.names = F,row.names = 1)

pbc$died <- pbc$fustat==1

head(pbc)
library(rms)

dd<-datadist(pbc)
options(datadist="dd")
options(na.action="na.delete")
summary(pbc$futime)
coxpbc<-cph(formula = Surv(futime,died) ~  Age + riskScore  ,data=pbc,x=T,y=T,surv = T,na.action=na.delete)  #,time.inc =2920

print(coxpbc)
surv<-Survival(coxpbc) 
surv3<-function(x) surv(1095,x)
surv1<-function(x) surv(365,x)
surv5<-function(x) surv(1825,x)
x<-nomogram(coxpbc,fun = list(surv1,surv3,surv5),lp=T,
            funlabel = c('1-year survival Probability','3-year survival Probability','5-year survival Probability'),
            maxscale = 100,fun.at = c(0.95,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1))

pdf("nomogram_classical.pdf",width = 12,height = 10)
plot(x, lplabel="Linear Predictor",
     xfrac=.35,varname.label=TRUE, varname.label.sep="=", ia.space=.2, 
     tck=NA, tcl=-0.20, lmgp=0.3,
     points.label='Points', total.points.label='Total Points',
     total.sep.page=FALSE, 
     cap.labels=FALSE,cex.var = 1.6,cex.axis = 1.05,lwd=5,
     label.every = 1,col.grid = gray(c(0.8, 0.95)))
dev.off()

#1-year
cox1 <- cph(Surv(futime,fustat) ~ Age + riskScore ,surv=T,x=T, y=T,time.inc = 1*365,data=pbc)
cal1<- calibrate(cox1, cmethod="KM", method="boot", u=1*365, m= 80, B=1000)

pdf("calibrate1.pdf")
plot(cal1,lwd=2,lty=1,errbar.col="black",xlim = c(0,1),ylim = c(0,1),xlab ="Nomogram-Predicted Probability of 1-Year Survival",ylab="Actual 1-Year Survival",col="blue",sub=F)
mtext("")
box(lwd = 0.5)
abline(0,1,lty = 3,lwd = 2,col = "black")
dev.off()


#3-year
cox3 <- cph(Surv(futime,fustat) ~ Age + riskScore ,surv=T,x=T, y=T,time.inc = 1*365*3,data=pbc)
cal3 <- calibrate(cox3, cmethod="KM", method="boot", u=1*365*3, m= 80, B=1000)

pdf("calibrate3.pdf")
plot(cal3,lwd=2,lty=1,errbar.col="black",xlim = c(0,1),ylim = c(0,1),xlab ="Nomogram-Predicted Probability of 3-Year Survival",ylab="Actual 3-Year Survival",col="blue",sub=F)
mtext("")
box(lwd = 0.5)
abline(0,1,lty = 3,lwd = 2,col = "black")
dev.off()      

#5-year
cox5 <- cph(Surv(futime,fustat) ~ Age + riskScore ,surv=T,x=T, y=T,time.inc = 1*365*5,data=pbc)
cal5 <- calibrate(cox5, cmethod="KM", method="boot", u=1*365*5, m= 80, B=1000)

pdf("calibrate5.pdf")
plot(cal5,lwd=2,lty=1,errbar.col="black",xlim = c(0,1),ylim = c(0,1),xlab ="Nomogram-Predicted Probability of 5-Year Survival",ylab="Actual 5-Year Survival",col="blue",sub=F)
mtext("")
box(lwd = 0.5)
abline(0,1,lty = 3,lwd = 2,col = "black")
dev.off()      
#GSEA
library(plyr)
library(ggplot2)
library(grid)
library(gridExtra)
setwd("D:\\biowolf\\metabolism\\24.multipleGSEA")          
files=grep(".xls",dir(),value=T)                                        
data = lapply(files,read.delim)                                          
names(data) = files
dataSet = ldply(data, data.frame)
dataSet$pathway = gsub(".xls","",dataSet$.id)                            
gseaCol=c("#E41A1C", "#377EB8" ,"#4DAF4A" ,"#984EA3" ,"#FF7F00", "#FFFF33", "#A65628")
pGsea=ggplot(dataSet,aes(x=RANK.IN.GENE.LIST,y=RUNNING.ES,fill=pathway,group=pathway))+
geom_point(shape=21) + scale_fill_manual(values = gseaCol[1:nrow(dataSet)]) +
labs(x = "", y = "Enrichment Score", title = "") + scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0),limits =c(min(dataSet$RUNNING.ES-0.02), max(dataSet$RUNNING.ES+0.02))) +
theme_bw() + theme(panel.grid =element_blank()) + theme(panel.border = element_blank()) +
theme(axis.line = element_line(colour = "black")) + theme(axis.line.x = element_blank(),axis.ticks.x = element_blank(),axis.text.x = element_blank()) +
geom_hline(yintercept = 0) + guides(fill=guide_legend(title = NULL)) +
theme(legend.background = element_blank()) + theme(legend.key = element_blank())
pGene=ggplot(dataSet,aes(RANK.IN.GENE.LIST,pathway,colour=pathway))+geom_tile()+
scale_color_manual(values = gseaCol[1:nrow(dataSet)]) +
labs(x = "High risk<----------->Low risk", y = "", title = "") +
scale_x_discrete(expand = c(0, 0)) + scale_y_discrete(expand = c(0, 0)) +
theme_bw() + theme(panel.grid = element_blank()) + theme(panel.border = element_blank()) + theme(axis.line = element_line(colour = "black"))+
theme(axis.line.y = element_blank(),axis.ticks.y = element_blank(),axis.text.y = element_blank())+ guides(color=FALSE)
gGsea = ggplot_gtable(ggplot_build(pGsea))
gGene = ggplot_gtable(ggplot_build(pGene))
maxWidth = grid::unit.pmax(gGsea$widths, gGene$widths)
gGsea$widths = as.list(maxWidth)
gGene$widths = as.list(maxWidth)
dev.off()
pdf('multipleGSEA.pdf',      
width=9,                
height=5)              
par(mar=c(5,5,2,5))
grid.arrange(arrangeGrob(gGsea,gGene,nrow=2,heights=c(.8,.3)))
dev.off()

#Immune cell enrichment
inputFile="uniq.symbol.txt"                                        
gmtFile="immune.gmt"                                          
library(GSVA)
library(limma)
library(GSEABase)
mat<-read.table("uniq.symbol.txt",head=T,sep='\t',check.names = F,row.names = 1)
mat<-as.matrix(mat)
geneSet=getGmt(gmtFile,
geneIdType=SymbolIdentifier())
ssgseaScore=gsva(mat, geneSet, method='ssgsea', kcdf='Gaussian', abs.ranking=TRUE)
write.table(t(ssgseaScore),file="ssgseaOut.txt",sep="\t",quote=F)
#Correlation
dat<-read.table("Immune_cell.txt",head=T,sep='\t',check.names = F,row.names = 1)
head(dat)
library(corrplot)
M = cor(mtcars)
M = cor(dat)
corrplot(M, order = 'AOE', addCoef.col = 'black', tl.pos = 'd',
cl.pos = 'n', col = brewer.pal(n = 10, name = 'PRGn'))

#Immune checkpoint
gene<-read.table("immune_checkpoint.txt",head=F,sep='\t',check.names = F)
head(gene)
dim(dat)
dim(mat)
gene_expr<-mat[intersect(gene$V1,row.names(mat)),]
dim(gene_expr)
write.table(t(gene_expr),"immune_checkpoint.txt",quote=F,sep='\t')

df<-read.table("immune_checkpoint.txt",head=T,sep='\t',check.names = F,row.names = 1)
data<-melt(df,
           id.vars = c('Risk'),
           measure.vars = colnames(df[-1]),
           variable.name='Immune_checkpoint',
           value.name='Expression')
p=ggboxplot(data, x="Immune_checkpoint", y="Expression", fill = "Risk",alpha=0.8,orientation = "horizontal",notch = TRUE,
            ylab="Expression",
            xlab="",
            palette = c("lancet"))
p=p+rotate_x_text(60)                    
p+stat_compare_means(aes(group=Risk),symnum.args=list(cutpoints = c(0, 0.001, 0.01, 0.05, 1), symbols = c("***", "**", "*", " ")),label = "p.signif")+theme(axis.text = element_text(size = 13, face = "bold"),axis.title = element_text(size = 13, face = "bold"))

#HLA
gene<-read.table("gene.txt",head=F,sep='\t',check.names = F)
gene_expr<-mat[intersect(gene$V1,row.names(mat)),]
gene_expr[1:4,1:4]
write.table(t(gene_expr),"HLA_expr.txt",quote=F,sep='\t')


df<-read.table("HLA_expr.txt",head=T,sep='\t',check.names = F,row.names = 1)
data<-melt(df,
           id.vars = c('Risk'),
           measure.vars = colnames(df[-1]),
           variable.name='HLA_expr',
           value.name='Expression')
p=ggboxplot(data, x="HLA_expr", y="Expression", fill = "Risk",alpha=0.8,orientation = "horizontal",notch = TRUE,
            ylab="Expression",
            xlab="",
            palette = c("lancet"))
p=p+rotate_x_text(60)                    
p+stat_compare_means(aes(group=Risk),symnum.args=list(cutpoints = c(0, 0.001, 0.01, 0.05, 1), symbols = c("***", "**", "*", " ")),label = "p.signif")+theme(axis.text = element_text(size = 13, face = "bold"),axis.title = element_text(size = 13, face = "bold"))

#Chemo_drug
library(pRRophetic)
library(ggplot2)
library(cowplot)
dat <- read.table("uniq.symbol.txt",sep = "\t",row.names = 1,header = T,stringsAsFactors = F,check.names = F)
ann <- read.table("trainRisk.txt",sep = "\t",row.names = 1,header = T,stringsAsFactors = F,check.names = F)
head(ann)
dat[1:4,1:4]
dat<-dat[,intersect(row.names(ann),colnames(dat))]
dim(dat)
GCP.drug <- read.table("drug.txt") 
GCP.drug <- GCP.drug$V1
GCP.drug
brewer.pal("Set1",n=3)
jco <- c("#E41A1C" "#377EB8")
jco <- c("#E41A1C", "#377EB8")
GCPinfo <- GCP.IC50 <- GCP.expr <- cvOut <- predictedPtype <- predictedBoxdat <- list() 
plotp <- list()
for (drug in GCP.drug) {
set.seed(1248103)
cat(drug," starts!\n") 

predictedPtype[[drug]] <- pRRopheticPredict(testMatrix = as.matrix(dat[,rownames(ann)]),
drug = drug,tissueType = "allSolidTumors",selection = 1) 
if(!all(names(predictedPtype[[drug]])==rownames(ann))) {stop("Name mismatched!\n")} 
predictedBoxdat[[drug]] <- data.frame("est.ic50"=predictedPtype[[drug]],"risk"=ann$risk, row.names = names(predictedPtype[[drug]]))
predictedBoxdat[[drug]]$risk <- factor(predictedBoxdat[[drug]]$risk,levels = c("high","low"),ordered = T) 

p <- ggplot(data = predictedBoxdat[[drug]], aes(x=risk, y=est.ic50))
p <- p + geom_boxplot(aes(fill = risk)) +
scale_fill_manual(values = jco[1:length(unique(ann$risk))]) + 
theme(legend.position="none") + 
theme(axis.text.x = element_text(angle = 45, hjust = 1,size = 12),plot.title = element_text(size = 12, hjust = 0.5)) +
xlab("") + ylab("Estimated IC50") +
ggtitle(drug) 
plotp[[drug]] <- p 
cat(drug," has been finished!\n") 
}
p2 <- plot_grid(plotlist=plotp, ncol=3)
ggsave("boxplot of predicted IC50_multiple.pdf", width = 8, height = 6)
p <- vector()
for (drug in GCP.drug) {
tmp <- wilcox.test(as.numeric(predictedBoxdat[[drug]][which(predictedBoxdat[[drug]]$ImmClust %in% "HPV16-IMM"),"est.ic50"]),
as.numeric(predictedBoxdat[[drug]][which(predictedBoxdat[[drug]]$ImmClust %in% "HPV16-KRT"),"est.ic50"]))$p.value
p <- append(p,tmp) 
}
names(p) <- GCP.drug
print(p) 
p <- vector()
for (drug in GCP.drug) {
tmp <- wilcox.test(as.numeric(predictedBoxdat[[drug]][which(predictedBoxdat[[drug]]$risk %in% "high"),"est.ic50"]),
as.numeric(predictedBoxdat[[drug]][which(predictedBoxdat[[drug]]$risk %in% "low"),"est.ic50"]))$p.value
p <- append(p,tmp) 
}
names(p) <- GCP.drug
print(p) 
write.table(p,"output_pvalue.txt", quote = F, sep = "\t")
#CMAP drug
dat<-read.table("uniq.symbol.txt",head=T,sep='\t',check.names = F,row.names = 1)
logFoldChange=1               
adjustP=0.05                   
conNum=189                      
treatNum=188
library(limma)
rt<-dat
rt<-log2(rt+1)
range(rt)
modType=c(rep("con",conNum),rep("treat",treatNum))
design <- model.matrix(~0+factor(modType))
colnames(design) <- c("con","treat")
fit <- lmFit(rt,design)
cont.matrix<-makeContrasts(treat-con,levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
allDiff=topTable(fit2,adjust='fdr',number=200000)
write.table(allDiff,file="mrnaAll.xls",sep="\t",quote=F)
risk<-read.table("trainRisk.txt",head=T,sep='\t',check.names = F,row.names = 1)
head(risk)
rt<-rt[,intersect(row.names(risk),colnames(rt))]
modType=c(rep("low",conNum),rep("high",treatNum))
design <- model.matrix(~0+factor(modType))
colnames(design) <- c("low","high")
fit <- lmFit(rt,design)
cont.matrix<-makeContrasts(high-low,levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
allDiff=topTable(fit2,adjust='fdr',number=200000)
write.table(allDiff,file="mrnaAll.xls",sep="\t",quote=F)
diffSig <- allDiff[with(allDiff, (abs(logFC)>logFoldChange & adj.P.Val < adjustP )), ]
diffSigOut=rbind(id=colnames(diffSig),diffSig)
write.table(diffSigOut,file="mrnaDiff.xls",sep="\t",quote=F,col.names=F)
write.table(diffSigOut,file="mrnaDiff.txt",sep="\t",quote=F,col.names=F)
diffSig <- allDiff[with(allDiff, (abs(logFC)>0.58 & adj.P.Val < adjustP )), ]
diffSigOut=rbind(id=colnames(diffSig),diffSig)
write.table(diffSigOut,file="mrnaDiff.xls",sep="\t",quote=F,col.names=F)
write.table(diffSigOut,file="mrnaDiff.txt",sep="\t",quote=F,col.names=F)
library(xlsx)
library(tidyverse)
library(GEOquery)
library(plyr)
library(circlize)
library(ComplexHeatmap)
options(java.parameters = "-Xmx8000m")
Sys.setenv(LANGUAGE = "en") 
options(stringsAsFactors = FALSE) 
MoAinput <- openxlsx::read.xlsx("MOA.xlsx", sheet = 34, colNames = T)

MoAinput[MoAinput$MoA == "NFkB pathway inhibitor", ]
MoAinput <- openxlsx::read.xlsx("MOA.xlsx", sheet = 2, colNames = T)
MoAinput[MoAinput$MoA == "NFkB pathway inhibitor", ]
PerturbagenID <- unlist(str_split(MoAinput$Name, ", "))
names(PerturbagenID) <- unlist(str_split(MoAinput$Perturbagen.Id, ", "))
MoAinput <- MoAinput[, c("MoA", "Perturbagen.Id")] %>% split(.$MoA) %>% lapply("[[", 2) %>%
lapply(., function(x)unlist(str_split(x, ", "))) %>% plyr::ldply(., data.frame)
colnames(MoAinput) <- c("mechanisms of action", "inhibitors")
oncoprintinput <- reshape2::dcast(MoAinput, `mechanisms of action` ~ inhibitors)
rownames(oncoprintinput) <- oncoprintinput$`mechanisms of action`
oncoprintinput <- oncoprintinput[, -1] %>% as.matrix(oncoprintinput)
oncoprintinput[!is.na(oncoprintinput)] <- "inhibitor"
oncoprintinput[is.na(oncoprintinput)] <- ""
colnames(oncoprintinput) <- PerturbagenID[colnames(oncoprintinput)]
oncoprintinput <- oncoprintinput[, order(colnames(oncoprintinput))]
help("oncoPrint")
alter_fun = list(
background = function(x, y, w, h)
grid.rect(x, y, w*0.9, h*0.9, gp = gpar(fill = "white", col = "grey")),
# dots
inhibitor = function(x, y, w, h)
grid.points(x, y, pch = 16, size = unit(0.8, "char"))
)
ha_coldata <- colSums(apply(oncoprintinput, 2, function(x) x=="inhibitor") + 0) %>% as.numeric()
ha_rowdata <- rowSums(apply(oncoprintinput, 2, function(x) x=="inhibitor") + 0) %>% as.numeric()
top_ha <- HeatmapAnnotation(inhibitors = anno_barplot(ha_coldata, axis = F, border = F,
gp = gpar(fill = "grey"),
bar_width = 1),
annotation_name_side = "left",
annotation_name_rot = 90)
right_ha <- rowAnnotation(count = anno_barplot(ha_rowdata, axis = F, border = F,
gp = gpar(fill = "grey"),
bar_width = 1, width = unit(1.5, "cm")),
annotation_name_side = "top",
annotation_name_rot = 0)
pdf("MoA.pdf", width = 10, height = 8, onefile = F)
oncoPrint(oncoprintinput, alter_fun = alter_fun,
show_column_names = TRUE, column_names_side = "top",
column_order = 1:ncol(oncoprintinput),
top_annotation = top_ha,
right_annotation = right_ha,col="blue",
show_pct = FALSE, show_heatmap_legend = F)
decorate_annotation("inhibitors", {
grid.text("mechanism of action", unit(1, "npc") + unit(3, "mm"), just = "left")})
dev.off()
#GSVA
dat<-read.table("trainRisk.txt",head=T,sep='\t',check.names = F,row.names = 1)
risk<-read.table("trainRisk.txt",head=T,sep='\t',check.names = F,row.names = 1)
dat<-read.table("uniq.symbol.txt",head=T,sep='\t',check.names = F,row.names = 1)
sample<-intersect(row.names(risk),colnames(dat))
dat<-dat[,sample]
dim(dat)
dat[1:4,1:4]
dim(risk)
library(GSVA)
library(limma)
library(stringr)
library(ggplot2)
Sys.setenv(LANGUAGE = "en") 
options(stringsAsFactors = FALSE) 
(load("hallmark.gs.RData"))
gsva_es <- gsva(as.matrix(dat), gs)
gsva_es[1:4,1:4]
write.csv(gsva_es, "gsva_output.csv", quote = F)
table(risk$risk)
Type=c(rep("high",188),rep("low",189))
names(Type)=colnames(gsva_es)
Type=as.data.frame(Type)
pheatmap(gsva_es,
annotation=Type,
color = colorRampPalette(c("green", "black", "red"))(50),
cluster_cols =F,
show_colnames = F,
scale="row",
fontsize = 12,
fontsize_row=12,
fontsize_col=10,gaps_col = c(188,188))
#DSS-PFI
library(survival)
library("survminer")
rt=read.table("PFI.txt",header=T,sep="\t");rt$PFI.time=rt$PFI.time/365
diff=survdiff(Surv(PFI.time, PFI) ~risk,data = rt)
pValue=1-pchisq(diff$chisq,df=1)
pValue=signif(pValue,4)
pValue=format(pValue, scientific = TRUE)
fit <- survfit(Surv(PFI.time, PFI) ~ risk, data = rt)
ggsurvplot(fit,
data=rt,
conf.int=T,pval=paste0("P = ",pValue),pval.size=6,legend.title="risk",
legend.labs=levels(factor(rt[,"risk"])),
legend = c(0.8, 0.8),font.legend=10,xlab="Time(years)",break.time.by = 1,palette = "Set1",surv.median.line = "hv",risk.table=T,cumevents=F,risk.table.height=.30)

rt=read.table("DSS.txt",header=T,sep="\t");rt$DSS.time=rt$DSS.time/365
diff=survdiff(Surv(DSS.time, DSS) ~risk,data = rt)
pValue=1-pchisq(diff$chisq,df=1)
pValue=signif(pValue,4)
pValue=format(pValue, scientific = TRUE)
fit <- survfit(Surv(DSS.time, DSS) ~ risk, data = rt)
ggsurvplot(fit,
data=rt,
conf.int=T,
pval=pValue,
pval.size=6,
legend.title="risk",
legend.labs=levels(factor(rt[,"risk"])),
legend = c(0.8, 0.8),
font.legend=10,
xlab="Time(years)",
break.time.by = 1,
palette = "Set1",
surv.median.line = "hv",
risk.table=T,
cumevents=F,
risk.table.height=.30)
